from math import sqrt
import itertools
import warnings
import numpy as np
import matplotlib.pyplot as plt
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as pyo
from prophet import Prophet
from prophet.diagnostics import cross_validation, cross_validation, performance_metrics
from prophet.plot import plot_plotly, plot_components_plotly
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.statespace.sarimax import SARIMAX
import statsmodels.api as sm
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow_probability import sts
plt.rcParams['figure.figsize']=(20,5)
pyo.init_notebook_mode()
# this is done mainly because of SARIMAX
warnings.filterwarnings("ignore")
def plot_forecast(data, forecast_samples, validation=True, CI=90):
"""Plot the observed time series alongside the forecast."""
plt.figure(figsize=(10, 4))
forecast_median = np.median(forecast_samples, axis=0)
num_steps = len(data)
num_steps_forecast = forecast_median.shape[-1]
plt.plot(np.arange(num_steps), data, lw=2, color='blue', linestyle='--', marker='o',
label='Observed beer production', alpha=0.7)
if validation: # we want to compare forecast with validation
forecast_steps = np.arange(num_steps_forecast)
else:
forecast_steps = np.arange(num_steps, num_steps+num_steps_forecast)
CI_interval = [(100 - CI)/2, 100 - (100 - CI)/2]
lower, upper = np.percentile(forecast_samples, CI_interval, axis=0)
plt.plot(forecast_steps, forecast_median, lw=2, ls='--', marker='o', color='orange',
label=str(CI) + '% Forecast Interval', alpha=0.7)
plt.fill_between(forecast_steps,
lower,
upper, color='orange', alpha=0.2)
if validation:
plt.xlim([0, num_steps_forecast])
else:
plt.xlim([0, num_steps+num_steps_forecast])
ymin, ymax = min(np.min(forecast_samples), np.min(data)), max(np.max(forecast_samples), np.max(data))
yrange = ymax-ymin
plt.title("{}".format(f'Observed time series with {num_steps_forecast} month Forecast'))
plt.xlabel('Month')
plt.ylabel('Monthly beer production')
plt.legend()
def plot_forecast_prophet(data, mean, upper, lower, validation=True, CI=90):
"""Plot the observed time series alongside the forecast."""
plt.figure(figsize=(10, 4))
num_steps = len(data)
num_steps_forecast = mean.shape[-1]
plt.plot(np.arange(num_steps), data, lw=2, color='blue', linestyle='--', marker='o',
label='Observed beer production', alpha=0.7)
if validation: # we want to compare forecast with validation
forecast_steps = np.arange(num_steps_forecast)
else:
forecast_steps = np.arange(num_steps, num_steps+num_steps_forecast)
plt.plot(forecast_steps, mean, lw=2, ls='--', marker='o', color='orange',
label=str(CI) + '% Forecast Interval', alpha=0.7)
plt.fill_between(forecast_steps,
lower,
upper, color='orange', alpha=0.2)
if validation:
plt.xlim([0, num_steps_forecast])
else:
plt.xlim([0, num_steps+num_steps_forecast])
ymin, ymax = min(np.min(mean), np.min(data)), max(np.max(mean), np.max(data))
yrange = ymax-ymin
plt.title("{}".format(f'Observed time series with {num_steps_forecast} month Forecast'))
plt.xlabel('Month')
plt.ylabel('Monthly beer production')
plt.legend()
ds = pd.read_csv('dataset.csv', names=['given_date', 'production'], header=0)
ds
| given_date | production | |
|---|---|---|
| 0 | 1956-01 | 93.2 |
| 1 | 1956-02 | 96.0 |
| 2 | 1956-03 | 95.2 |
| 3 | 1956-04 | 77.1 |
| 4 | 1956-05 | 70.9 |
| ... | ... | ... |
| 471 | 1995-04 | 127.0 |
| 472 | 1995-05 | 151.0 |
| 473 | 1995-06 | 130.0 |
| 474 | 1995-07 | 119.0 |
| 475 | 1995-08 | 153.0 |
476 rows × 2 columns
In this subsection we create various features starting from adding seasons
winter_months = [12, 1, 2]
spring_months = [3, 4, 5]
summer_months = [6, 7, 8]
autumn_months = [9, 10, 11]
ds['date'] = pd.to_datetime(ds['given_date'])
ds['month'] = ds['date'].dt.month
ds['year'] = ds['date'].dt.year
ds.loc[ds['month'].isin(winter_months),'season'] = 'Winter'
ds.loc[ds['month'].isin(spring_months),'season'] = 'Spring'
ds.loc[ds['month'].isin(summer_months),'season'] = 'Summer'
ds.loc[ds['month'].isin(autumn_months),'season'] = 'Autumn'
ds
| given_date | production | date | month | year | season | |
|---|---|---|---|---|---|---|
| 0 | 1956-01 | 93.2 | 1956-01-01 | 1 | 1956 | Winter |
| 1 | 1956-02 | 96.0 | 1956-02-01 | 2 | 1956 | Winter |
| 2 | 1956-03 | 95.2 | 1956-03-01 | 3 | 1956 | Spring |
| 3 | 1956-04 | 77.1 | 1956-04-01 | 4 | 1956 | Spring |
| 4 | 1956-05 | 70.9 | 1956-05-01 | 5 | 1956 | Spring |
| ... | ... | ... | ... | ... | ... | ... |
| 471 | 1995-04 | 127.0 | 1995-04-01 | 4 | 1995 | Spring |
| 472 | 1995-05 | 151.0 | 1995-05-01 | 5 | 1995 | Spring |
| 473 | 1995-06 | 130.0 | 1995-06-01 | 6 | 1995 | Summer |
| 474 | 1995-07 | 119.0 | 1995-07-01 | 7 | 1995 | Summer |
| 475 | 1995-08 | 153.0 | 1995-08-01 | 8 | 1995 | Summer |
476 rows × 6 columns
def describe_dataset(df: pd.DataFrame) -> pd.DataFrame:
"""
Function showing for each column of pandas data frame:
data type and counts of NAs, unique values or values given
"""
describe = df.describe().transpose()
describe['NA'] = df.isna().sum()
describe['unique'] = df.nunique()
describe['types'] = df.dtypes
describe['count'] = df.count()
ordered_columns = ['NA', 'unique', 'types', 'count', 'mean',
'std', 'min', '25%', '50%', '75%', 'max']
board = describe[ordered_columns]
return board.sort_values('NA', ascending=False)
describe_dataset(ds)
| NA | unique | types | count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| production | 0 | 383 | float64 | 476 | 136.395378 | 33.738725 | 64.8 | 112.9 | 139.15 | 158.825 | 217.8 |
| month | 0 | 12 | int64 | 476 | 6.466387 | 3.449016 | 1.0 | 3.0 | 6.00 | 9.000 | 12.0 |
| year | 0 | 40 | int64 | 476 | 1975.336134 | 11.464014 | 1956.0 | 1965.0 | 1975.00 | 1985.000 | 1995.0 |
We can see that our dataset does not contain any NaN', which is great since we don't need to interpolate any values
From the graph we can see that the time series is not stationary (mean is not time invariant, i.e. there exists trend )
fig = px.line(ds, x='date', y="production", hover_data=['season'])
fig.show()
Our data does consists of trends + seasonal effects + noise. \ Mathematically for additive model: $$ y(x(t)) = a_1*x_{trend}(t) + a_2*x_{seasonal}(t) + noise(t)$$ for multiplicative model: $$ y(x(t)) = a_1*x_{trend}(t) * a_2*x_{seasonal}(t) * noise(t)$$
for idx, model in enumerate(['additive', 'multiplicative']):
decomp = seasonal_decompose(ds['production'], period=12, model=model, extrapolate_trend='freq')
ds[f"{model}_production_seasonal"] = decomp.seasonal
ds[f"{model}_production_residual"] = decomp.resid
if idx == 0:
ds[f"production_trend"] = decomp.trend
fig = px.line(ds, x='date', y="production_trend", hover_data=['season'])
fig.show()
fig = px.line(ds, x='date', y="additive_production_seasonal", hover_data=['season'], title='')
fig.show()
fig = px.line(ds, x='date', y="multiplicative_production_seasonal", hover_data=['season'], title='')
fig.show()
fig = px.line(ds, x='date', y="additive_production_residual", hover_data=['season'])
fig.show()
fig = px.line(ds, x='date', y="multiplicative_production_residual", hover_data=['season'])
fig.show()
ds.loc[:, 'sin_month'] = np.sin(2*np.pi*ds.month/12)
ds.loc[:, 'cos_month'] = np.cos(2*np.pi*ds.month/12)
px.scatter(data_frame=ds, x='sin_month', y='cos_month', hover_data=['season'], color='season')
From our data analysis we saw that the trend of production is increasing until 1975, where it starts to oscilate. Our goal is to train correct model on the new data, so we are going to cutoff all data before 1975
ds = ds[ds['year'] >= 1975]
We are going to split our dataset into 2 parts - train and test set with distribution being 98% + and 2%
n = len(ds)
train_df = ds[0:int(n*0.98)]
test_df = ds[int(n*0.02):]
num_features = ds.shape[1]
num_cols = ['production', 'production_trend', 'additive_production_seasonal', 'multiplicative_production_seasonal', 'multiplicative_production_residual',
'additive_production_residual', 'prev_year_data', 'prev_month_data']
safe_cols = ['cos_month', 'sin_month']
In this subsection we are going to fit and cross-validate a multitude of models, which in the end of this notebook are going to be bootstrapped to create a combined prediction. We are going to use TimeSeries split with 5 folds.
Since our data not only contains trend, but also seasonal component we are not going to be using ARIMA model.
SARIMA model can be parametrized using 7 parameters (First 3 are same for ARIMA, while the last 4 account for the seasonality in our model)
Seasonal parameters:
In next subsection we are going to find the optimal hyperparameters for our model using hyperopt package.
# Bounded region of parameter space
space={'order_p': hp.choice("order_p", [0, 1, 2, 3, 4]),
'order_d': hp.choice ('order_d', [0, 1, 2, 3, 4]),
'order_q' : hp.choice('order_q', [0, 1, 2, 3, 4]),
'seasonal_order_p' : hp.choice('seasonal_order_p', [0, 1, 2, 3, 4]),
'seasonal_order_d' : hp.choice('seasonal_order_d', [0, 1, 2, 3, 4]),
'seasonal_order_q' : hp.choice('seasonal_order_q', [0, 1, 2, 3, 4]),
'seasonal_order_s' : 12,
}
def objective_sarimax(space, n_splits = 4):
tss = TimeSeriesSplit(n_splits=n_splits)
metric = 0
for train_index, val_index in tss.split(train_df):
train_dataset = train_df.iloc[train_index]
val_dataset = train_df.iloc[val_index]
model = sm.tsa.statespace.SARIMAX(endog=train_dataset['production'].values.astype('float64'),
order=(space['order_p'], space['order_d'],space['order_q']),
seasonal_order=(space['seasonal_order_p'], space['seasonal_order_d'],space['seasonal_order_q'], space['seasonal_order_s']),
enforce_stationarity=True, enforce_invertibility=True, initialization='approximate_diffuse')
result = model.fit()
prediction = result.get_forecast(val_dataset.shape[0]).predicted_mean
rmse = mean_squared_error(val_dataset['production'], prediction, squared=True)
metric += rmse/n_splits
return {'loss': metric, 'status': STATUS_OK}
trials = Trials()
# minimizing rmse metric (goodness of fit)
bh = fmin(fn = objective_sarimax,
space = space,
algo = tpe.suggest,
max_evals = 5,
trials = trials)
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [08:03<00:00, 96.65s/trial, best loss: 490.3819805195452]
bh
{'order_d': 1,
'order_p': 4,
'order_q': 3,
'seasonal_order_d': 0,
'seasonal_order_p': 0,
'seasonal_order_q': 4}
model_sarimax = sm.tsa.statespace.SARIMAX(endog=train_df['production'].values.astype('float64'),
order=(bh['order_p'], bh['order_d'],bh['order_q']),
seasonal_order=(bh['seasonal_order_p'], bh['seasonal_order_d'],bh['seasonal_order_q'], 12),
enforce_stationarity=True, enforce_invertibility=True, initialization='approximate_diffuse')
result = model_sarimax.fit()
forecast = result.get_forecast(test_df.shape[0])
prediction = forecast.predicted_mean
plt.scatter(x= test_df.index, y=test_df['production'], label = 'unobserved data')
plt.scatter(x= test_df.index, y=prediction, label='predicted production')
plt.title('Evaluation of SARIMA model on testing dataset')
plt.legend()
plt.show()
Our model did not achieve the optimal performance, which is be due to the low amount of hyperparameter rounds. (time constranints)
model_sarimax = sm.tsa.statespace.SARIMAX(endog=ds['production'].values.astype('float64'),
order=(bh['order_p'], bh['order_d'],bh['order_q']),
seasonal_order=(bh['seasonal_order_p'], bh['seasonal_order_d'],bh['seasonal_order_q'], 12),
enforce_stationarity=True, enforce_invertibility=True, initialization='approximate_diffuse')
result = model_sarimax.fit()
forecast = result.get_forecast(16)
forecast_sarima = forecast.predicted_mean # forecasted 1996
In the next subsection we are going to evaluate our probability model on testing set
def build_model(observed_time_series):
trend = sts.LocalLinearTrend(observed_time_series=observed_time_series) # linear trend
seasonal_year = tfp.sts.Seasonal(
num_seasons=12, observed_time_series=observed_time_series, name='seasonal_year') # yearly seasonality
seasonal_quarter = tfp.sts.Seasonal(
num_seasons=3, observed_time_series=observed_time_series, name='seasonal_quarter') # quarter-wise seasonality
autoregressive = sts.Autoregressive(order=1, observed_time_series=observed_time_series) # autoregressive
model = sts.Sum([trend, seasonal_year, seasonal_quarter, autoregressive], observed_time_series=observed_time_series)
return model
beer_model = build_model(tf.Variable(train_df['production'].values.astype(np.float32)))
variational_posteriors = sts.build_factored_surrogate_posterior(model=beer_model)
WARNING:tensorflow:From c:\users\br077226\anaconda3\envs\time\lib\site-packages\tensorflow\python\ops\linalg\linear_operator_composition.py:181: LinearOperator.graph_parents (from tensorflow.python.ops.linalg.linear_operator) is deprecated and will be removed in a future version. Instructions for updating: Do not call `graph_parents`. WARNING:tensorflow:From c:\users\br077226\anaconda3\envs\time\lib\site-packages\tensorflow_probability\python\distributions\distribution.py:346: MultivariateNormalFullCovariance.__init__ (from tensorflow_probability.python.distributions.mvn_full_covariance) is deprecated and will be removed after 2019-12-01. Instructions for updating: `MultivariateNormalFullCovariance` is deprecated, use `MultivariateNormalTriL(loc=loc, scale_tril=tf.linalg.cholesky(covariance_matrix))` instead.
num_variational_steps = 500
optimizer = tf.optimizers.Adam(learning_rate=.01 ,beta_2=0.95, beta_1=0.95) # Could be tuned with hyperparameter optimization
# Using fit_surrogate_posterior to build and optimize the variational loss function.
def train():
elbo_loss_curve = tfp.vi.fit_surrogate_posterior(
target_log_prob_fn=beer_model.joint_log_prob(
observed_time_series=tf.Variable(train_df['production'].values.astype(np.float32))),
surrogate_posterior=variational_posteriors,
optimizer=optimizer,
num_steps=num_variational_steps)
return elbo_loss_curve
elbo_loss_curve = train()
plt.plot(elbo_loss_curve)
plt.show()
# Draw samples from the variational posterior.
q_samples_beer = variational_posteriors.sample(100)
WARNING:tensorflow:From c:\users\br077226\anaconda3\envs\time\lib\site-packages\tensorflow\python\ops\array_ops.py:5043: calling gather (from tensorflow.python.ops.array_ops) with validate_indices is deprecated and will be removed in a future version. Instructions for updating: The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.
print("Inferred parameters:")
for param in beer_model.parameters:
print("{}: {} +- {}".format(param.name,
np.mean(q_samples_beer[param.name], axis=0),
np.std(q_samples_beer[param.name], axis=0)))
Inferred parameters: observation_noise_scale: 7.818804740905762 +- 0.8545153737068176 LocalLinearTrend/_level_scale: 0.3303487002849579 +- 0.4837746322154999 LocalLinearTrend/_slope_scale: 0.10804043710231781 +- 0.02412397600710392 seasonal_year/_drift_scale: 1.7744407653808594 +- 0.32054707407951355 seasonal_quarter/_drift_scale: 3.353459358215332 +- 1.0370733737945557 Autoregressive/_coefficients: [-0.19138293] +- [0.13953717] Autoregressive/_level_scale: 5.059778213500977 +- 1.1108851432800293
beer_forecast_dist = tfp.sts.forecast(
beer_model,
observed_time_series=tf.Variable(train_df['production'].values.astype(np.float32)),
parameter_samples=q_samples_beer,
num_steps_forecast=test_df.shape[0])
num_samples=50
beer_forecast_mean, beer_forecast_scale, beer_forecast_samples = (
beer_forecast_dist.mean()[..., 0],
beer_forecast_dist.stddev()[..., 0],
beer_forecast_dist.sample(num_samples)[..., 0])
Compare on the testing dataset
plot_forecast(test_df['production'], beer_forecast_samples)
beer_forecast_dist = tfp.sts.forecast(
beer_model,
observed_time_series=tf.Variable(ds['production'].values.astype(np.float32)),
parameter_samples=q_samples_beer,
num_steps_forecast=16)
num_samples=50
beer_forecast_mean, beer_forecast_scale, beer_forecast_samples = (
beer_forecast_dist.mean()[..., 0],
beer_forecast_dist.stddev()[..., 0],
beer_forecast_dist.sample(num_samples)[..., 0])
Due to nature of the model (probabilistic) we can see that with increase in time our upper and lower interval is increasing, while the prediction is relatively ok
We are going to use out of the box ready model created by facebook called prophet.
raw_data = pd.read_csv('dataset.csv', names=['given_date', 'y'], header=0)
raw_data['ds'] = pd.to_datetime(raw_data['given_date'])
raw_data['date'] = pd.to_datetime(raw_data['given_date'])
raw_data['month'] = raw_data['date'].dt.month
raw_data['year'] = raw_data['date'].dt.year
ds_p = raw_data[raw_data['year'] >= 1975]
ds_p = ds_p[['ds', 'y']]
ds_p
| ds | y | |
|---|---|---|
| 228 | 1975-01-01 | 161.4 |
| 229 | 1975-02-01 | 169.4 |
| 230 | 1975-03-01 | 168.8 |
| 231 | 1975-04-01 | 158.1 |
| 232 | 1975-05-01 | 158.5 |
| ... | ... | ... |
| 471 | 1995-04-01 | 127.0 |
| 472 | 1995-05-01 | 151.0 |
| 473 | 1995-06-01 | 130.0 |
| 474 | 1995-07-01 | 119.0 |
| 475 | 1995-08-01 | 153.0 |
248 rows × 2 columns
train_df_p = ds_p[0:int(n*0.98)]
test_df_p = ds_p[int(n*0.02):]
param_grid = {
'changepoint_prior_scale': [0.001, 0.01],
'seasonality_prior_scale': [0.01, 0.1],
}
# Generate all combinations of parameters
all_params = [dict(zip(param_grid.keys(), v)) for v in itertools.product(*param_grid.values())]
rmses = [] # Store the RMSEs for each params here
# Use cross validation to evaluate all parameters
for params in all_params:
model = Prophet(**params).fit(train_df_p) # Fit model with given params
ds_cv = cross_validation(model, horizon="12 M", parallel="processes")
df_p = performance_metrics(ds_cv, rolling_window=1)
rmses.append(df_p['rmse'].values[0])
# Find the best parameters
tuning_results = pd.DataFrame(all_params)
tuning_results['rmse'] = rmses
print(tuning_results)
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. INFO:prophet:Making 230 forecasts with cutoffs between 1976-01-31 23:48:00 and 1995-02-28 23:48:00 INFO:prophet:Applying in parallel with <concurrent.futures.process.ProcessPoolExecutor object at 0x0000014248ECD370> INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. INFO:prophet:Making 230 forecasts with cutoffs between 1976-01-31 23:48:00 and 1995-02-28 23:48:00 INFO:prophet:Applying in parallel with <concurrent.futures.process.ProcessPoolExecutor object at 0x000001422F0F9FD0> INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. INFO:prophet:Making 230 forecasts with cutoffs between 1976-01-31 23:48:00 and 1995-02-28 23:48:00 INFO:prophet:Applying in parallel with <concurrent.futures.process.ProcessPoolExecutor object at 0x000001423F3CF760> INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. INFO:prophet:Making 230 forecasts with cutoffs between 1976-01-31 23:48:00 and 1995-02-28 23:48:00 INFO:prophet:Applying in parallel with <concurrent.futures.process.ProcessPoolExecutor object at 0x0000014248ECD040>
changepoint_prior_scale seasonality_prior_scale rmse 0 0.001 0.01 15.081731 1 0.001 0.10 13.717966 2 0.010 0.01 14.834582 3 0.010 0.10 12.958054
future = model.make_future_dataframe(freq='M', periods=test_df_p.shape[0]).tail(test_df_p.shape[0])
future.shape
(244, 1)
forecast = model.predict(future)
forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()
| ds | yhat | yhat_lower | yhat_upper | |
|---|---|---|---|---|
| 239 | 2015-02-28 | 145.424643 | 131.531512 | 158.848533 |
| 240 | 2015-03-31 | 129.833379 | 115.626468 | 144.779782 |
| 241 | 2015-04-30 | 123.381825 | 107.800741 | 136.709257 |
| 242 | 2015-05-31 | 111.165355 | 96.924564 | 126.166010 |
| 243 | 2015-06-30 | 120.993293 | 107.101118 | 135.451232 |
best_params = all_params[np.argmin(rmses)]
print(best_params)
{'changepoint_prior_scale': 0.01, 'seasonality_prior_scale': 0.1}
plot_forecast_prophet(test_df_p['y'], forecast['yhat'].values,forecast['yhat_lower'].values, forecast['yhat_upper'].values)
fig2 = model.plot_components(forecast)
plot_components_plotly(model, forecast)
plot_plotly(model, forecast)
best_params = all_params[np.argmin(rmses)]
print(best_params)
model = Prophet(**best_params).fit(ds_p)
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
{'changepoint_prior_scale': 0.01, 'seasonality_prior_scale': 0.1}
future = model.make_future_dataframe(freq='M', periods=ds_p.shape[0])
future = future[(future['ds'] >= '1996-01-01') & (future['ds'] <= '1996-12-31') ]
forecast_prophet = model.predict(future)
forecast_prophet[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()
| ds | yhat | yhat_lower | yhat_upper | |
|---|---|---|---|---|
| 7 | 1996-08-31 | 140.010166 | 125.678333 | 155.351909 |
| 8 | 1996-09-30 | 165.830330 | 151.134219 | 179.967968 |
| 9 | 1996-10-31 | 177.056821 | 162.150742 | 193.080866 |
| 10 | 1996-11-30 | 189.757650 | 174.548378 | 204.968193 |
| 11 | 1996-12-31 | 153.038899 | 136.850248 | 167.607282 |
We are going to train the models on the whole dataset (except for the data points before 1975) and forecast the production for the year 1996, while averaging predictions from our 3 models.
Since our data ends at 1995-08 we are going to predict 16 months before to cover the whole year 1996.
predictions_ds = pd.DataFrame()
predictions_ds['prophet'] = forecast_prophet['yhat']
predictions_ds['tf_prob'] = beer_forecast_mean[4:].numpy() # we take last 12 months
predictions_ds['sarima'] = forecast_sarima[4:] # we take last 12 months
predictions_ds['means'] = predictions_ds.mean(axis=1)
predictions_ds['std'] = predictions_ds.std(axis=1)
predictions_ds['date'] = forecast_prophet['ds']
predictions_ds
| prophet | tf_prob | sarima | means | std | date | |
|---|---|---|---|---|---|---|
| 0 | 144.298799 | 138.039032 | 159.088593 | 147.142142 | 8.825511 | 1996-01-31 |
| 1 | 158.407709 | 143.891037 | 141.039626 | 147.779457 | 7.604930 | 1996-02-29 |
| 2 | 142.816021 | 155.147552 | 146.137402 | 148.033658 | 5.209831 | 1996-03-31 |
| 3 | 138.281803 | 129.143173 | 142.037971 | 136.487649 | 5.414991 | 1996-04-30 |
| 4 | 125.458043 | 143.975983 | 151.307382 | 140.247136 | 10.877356 | 1996-05-31 |
| 5 | 134.473588 | 126.633301 | 132.559085 | 131.221991 | 3.337503 | 1996-06-30 |
| 6 | 141.104922 | 123.432678 | 137.082697 | 133.873432 | 7.563138 | 1996-07-31 |
| 7 | 140.010166 | 147.307739 | 146.575228 | 144.631044 | 3.281111 | 1996-08-31 |
| 8 | 165.830330 | 139.074570 | 140.940671 | 148.615190 | 12.196758 | 1996-09-30 |
| 9 | 177.056821 | 155.704239 | 173.235348 | 168.665469 | 9.296811 | 1996-10-31 |
| 10 | 189.757650 | 187.279236 | 173.555898 | 183.530928 | 7.125613 | 1996-11-30 |
| 11 | 153.038899 | 188.392609 | 158.471725 | 166.634411 | 15.544416 | 1996-12-31 |
plt.plot(predictions_ds.index, predictions_ds['means'], lw=2, ls='--', marker='o', color='blue',
label='Bootstrapped forecasting', alpha=0.7)
plt.plot(predictions_ds.index, predictions_ds['prophet'], lw=2, ls='--', marker='o', color='red',
label='prophet forecasting', alpha=0.7)
plt.plot(predictions_ds.index, predictions_ds['tf_prob'], lw=2, ls='--', marker='o', color='green',
label='tf_prob forecasting', alpha=0.7)
plt.plot(predictions_ds.index, predictions_ds['sarima'], lw=2, ls='--', marker='o', color='black',
label='sarima forecasting', alpha=0.7)
plt.fill_between(predictions_ds.index,
predictions_ds['means'] - 2*predictions_ds['std'],
predictions_ds['means'] + 2*predictions_ds['std'], color='orange', alpha=0.2)
plt.title('Forecast for 1996 of beer production using 3 models')
plt.xlabel('Months')
plt.ylabel('Beer production')
plt.legend()
plt.show()
Performance of our models correlates with ideas we have found out in our EDA (higher production during end of Autumn and Winter). This could mean, that we have succesfully forecasted beer production for the year 1996